library(tidyverse)
library(tidymodels)
library(tidytext)
library(plotly)
library(ggpubr)
library(GGally)
library(embed)
library(here)tracks_summary_file <- here("data", "processed", "tracks_summary.tsv")
tracks_summary_df <- read_tsv(tracks_summary_file, show_col_types = FALSE) %>%
mutate(
across(contains("id"), as.character),
divided = as.logical(divided),
across(contains("filamented_"), ~factor(
.x,
levels = c(FALSE, TRUE),
labels = c("Not filamented", "Filamented")
)
),
across(contains("survived_"), ~factor(
.x,
levels = c(FALSE, TRUE),
labels = c("Not survived", "Survived")
)
),
cell_status = paste0(filamented_track, " - ", survived_strict) %>%
factor()
) %>%
relocate(where(is.character), where(is.factor), where(is.logical)) %>%
filter(initial_time <= centered_antibiotic_start_time, end_time > centered_antibiotic_start_time) %>%
glimpse()## Rows: 13,524
## Columns: 28
## $ experiment_id <chr> "Chromosome", "Chromosome"…
## $ id <chr> "xy01_1_10.008-34.000", "x…
## $ filamented_track <fct> Filamented, Filamented, Fi…
## $ survived_strict <fct> Survived, Survived, Not su…
## $ survived_soft <fct> Survived, Survived, Surviv…
## $ cell_status <fct> Filamented - Survived, Fil…
## $ exists_before_experiment <lgl> TRUE, TRUE, FALSE, FALSE, …
## $ exists_before_or_during_experiment <lgl> TRUE, TRUE, TRUE, TRUE, TR…
## $ exists_after_experiment <lgl> TRUE, TRUE, TRUE, TRUE, TR…
## $ divided <lgl> TRUE, TRUE, TRUE, TRUE, TR…
## $ initial_time <dbl> 0, 10, 60, 60, 0, 0, 0, 0,…
## $ end_time <dbl> 240, 160, 170, 240, 170, 1…
## $ lived_time <dbl> 240, 150, 110, 180, 170, 1…
## $ centered_antibiotic_start_time <dbl> 60, 60, 60, 60, 60, 60, 60…
## $ centered_antibiotic_end_time <dbl> 100, 100, 100, 100, 100, 1…
## $ sos_at_time <dbl> 20, 10, 50, 40, 30, 70, 70…
## $ dead_at_time <dbl> NA, 170, 180, NA, 180, 170…
## $ initial_length <dbl> 25.84965, 41.41517, 30.751…
## $ initial_gfp <dbl> 0.9977303, 0.9980406, 0.99…
## $ initial_ds_red <dbl> 2.987506, 2.987671, 3.0091…
## $ end_length <dbl> 56.78846, 134.41667, 107.4…
## $ end_gfp <dbl> 0.9979626, 0.9963134, 0.99…
## $ end_ds_red <dbl> 2.983168, 2.980438, 2.9792…
## $ sos_length <dbl> 41.41517, 41.41517, 49.549…
## $ sos_gfp <dbl> 0.9980406, 0.9980406, 0.99…
## $ sos_ds_red <dbl> 2.987671, 2.987671, 3.0841…
## $ n_divisons <dbl> 7, 1, 1, 7, 4, 2, 3, 5, 1,…
## $ time_since_last_divison_to_experiment_start <dbl> 50, 50, NA, NA, 10, 50, 50…
theme_set(
theme_bw() +
theme(legend.position = "top")
)parse_metrics_column <- function(.data, metric_column) {
.data %>%
mutate(
{{ metric_column }} := str_remove(
string = {{ metric_column }},
pattern = "_(.+)"
) %>%
factor(
levels = c("initial", "sos", "end"),
labels = c("Initial", "SOS", "End")
)
)
}tracks_summary_df %>%
count(experiment_id, cell_status) %>%
group_by(experiment_id) %>%
mutate(
percentage = n / sum(n) * 100,
ymax = cumsum(percentage),
ymin = c(0, head(ymax, -1)),
labels = paste0(format(percentage, digits = 2), "%"),
labels_position = (ymax + ymin) / 2,
total_label = paste0("Total:\n", format(sum(n), big.mark = ","), " cells")
) %>%
ungroup() %>%
identity() %>%
ggplot(
aes(
ymin = ymin,
ymax = ymax,
xmin = 3,
xmax = 4
)
) +
geom_rect(
size = 1.5,
color = "white",
aes(fill = cell_status)
) +
geom_label(
x = 2,
aes(
y = labels_position,
label = labels
),
label.size = NA,
size = 3.5
) +
geom_text(aes(x = -Inf, y = -Inf, label = total_label), hjust = 0.5, vjust = 0.5) +
facet_grid(. ~ experiment_id) +
coord_polar(theta = "y") +
xlim(c(-1, 4)) +
guides(
fill = guide_legend(ncol = 2)
) +
theme_void() +
theme(
legend.position = "bottom"
) +
labs(
fill = "Cell status"
) +
NULLtracks_summary_df %>%
select(-sos_gfp) %>%
pivot_longer(
cols = contains("gfp"),
names_to = "metric",
values_to = "value"
) %>%
parse_metrics_column(metric) %>%
filter(!is.na(value)) %>%
ggplot(aes(x = value, color = cell_status, fill = cell_status)) +
geom_density(aes(y = ..scaled..), alpha = 1 / 4) +
facet_grid(metric ~ experiment_id) +
scale_y_continuous(
labels = scales::percent
) +
theme(
panel.spacing.y = unit(0.5, "lines")
) +
guides(
color = guide_legend(ncol = 2),
fill = guide_legend(ncol = 2)
) +
labs(
x = "Normalized mean GFP (log10)",
y = "Scaled density",
color = "Cell status",
fill = "Cell status"
) +
NULLtracks_summary_df %>%
select(-sos_length) %>%
pivot_longer(
cols = contains("length"),
names_to = "metric",
values_to = "value"
) %>%
parse_metrics_column(metric) %>%
filter(!is.na(value)) %>%
ggplot(aes(x = value, color = cell_status, fill = cell_status)) +
geom_density(aes(y = ..scaled..), alpha = 1 / 4) +
facet_grid(metric ~ experiment_id) +
scale_y_continuous(
labels = scales::percent
) +
coord_cartesian(xlim = c(0, 150)) +
theme(
panel.spacing.y = unit(0.5, "lines")
) +
guides(
color = guide_legend(ncol = 2),
fill = guide_legend(ncol = 2)
) +
labs(
x = "Cell length",
y = "Scaled density",
color = "Cell status",
fill = "Cell status"
) +
NULLtracks_summary_df %>%
ggplot(aes(x = initial_gfp, y = initial_length, color = cell_status)) +
geom_point(alpha = 1/20) +
facet_grid(~experiment_id) +
guides(
color = guide_legend(ncol = 2, override.aes = list(alpha = 1)),
fill = guide_legend(ncol = 2)
) +
labs(
x = "Initial normalized GFP (log10)",
y = "Initial length",
color = "Cell status"
) +
NULLtracks_summary_df %>%
ggplot(aes(x = end_gfp - initial_gfp, y = end_length - initial_length, color = cell_status)) +
geom_point(alpha = 1/20) +
facet_grid(~experiment_id) +
guides(
color = guide_legend(ncol = 2, override.aes = list(alpha = 1)),
fill = guide_legend(ncol = 2)
) +
labs(
x = "End normalized GFP - Initial normalized GFP (log10)",
y = "End length - Initial length",
color = "Cell status"
) +
NULLtracks_summary_df %>%
ggplot(aes(x =factor(lived_time), fill = cell_status)) +
geom_bar(position = "fill", stat = "count", width = 1) +
facet_grid(experiment_id ~ .) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0), labels = scales::percent) +
guides(
color = guide_legend(ncol = 2),
fill = guide_legend(ncol = 2)
) +
theme(
panel.spacing.y = unit(0.9, "lines"),
panel.grid = element_blank()
) +
labs(
x = "Cell life time",
y = "Percentage of cells",
fill = "Cell status"
) +
NULLtracks_summary_df %>%
select(experiment_id, id, lived_time, filamented_track, survived_strict, contains("gfp")) %>%
rename(survived = contains("survived")) %>%
pivot_longer(
cols = contains("gfp"),
names_to = "metric"
) %>%
filter(!is.na(value)) %>%
group_by(experiment_id, lived_time, filamented_track, survived, metric) %>%
summarize(
value = mean(value),
.groups = "drop"
) %>%
parse_metrics_column(metric) %>%
identity() %>%
ggplot(aes(x = lived_time, y = value)) +
geom_line(aes(group = lived_time)) +
geom_point(aes(color = metric, shape = survived)) +
facet_grid(filamented_track ~ experiment_id) +
scale_color_hue(direction = -1, h.start = 90) +
scale_fill_hue(direction = -1, h.start = 90) +
labs(
x = "Cell life time",
y = "Average normalized GFP value",
color = "Length type",
shape = "Cell status"
) +
theme(
panel.spacing.x = unit(0.9, "lines"),
) +
NULLtracks_summary_df %>%
select(experiment_id, id, lived_time, filamented_track, survived_strict, contains("length")) %>%
rename(survived = contains("survived")) %>%
pivot_longer(
cols = contains("length"),
names_to = "metric"
) %>%
filter(!is.na(value)) %>%
group_by(experiment_id, lived_time, filamented_track, survived, metric) %>%
summarize(
value = mean(value),
.groups = "drop"
) %>%
parse_metrics_column(metric) %>%
identity() %>%
ggplot(aes(x = lived_time, y = value)) +
geom_line(aes(group = lived_time)) +
geom_point(aes(color = metric, shape = survived)) +
facet_grid(filamented_track ~ experiment_id) +
scale_color_hue(direction = -1, h.start = 90) +
scale_fill_hue(direction = -1, h.start = 90) +
labs(
x = "Cell life time",
y = "Average length value",
color = "Length type",
shape = "Cell status"
) +
theme(
panel.spacing.x = unit(0.9, "lines"),
) +
NULLtracks_summary_df %>%
group_by(experiment_id, cell_status) %>%
summarise(
n = median(n_divisons),
.groups = "drop"
) %>%
ggplot(aes(x = fct_reorder(cell_status, n), y = n, color = cell_status, fill = cell_status)) +
geom_point() +
geom_segment(aes(xend = cell_status, y=0, yend = n)) +
facet_grid(. ~ experiment_id) +
theme(
panel.grid.major = element_blank(),
legend.position = "none"
) +
labs(
x = "Cell status",
y = "Median number of division"
) +
coord_flip() +
NULLtracks_summary_df %>%
filter(!is.na(time_since_last_divison_to_experiment_start)) %>%
ggplot(aes(x = factor(time_since_last_divison_to_experiment_start), fill = cell_status)) +
geom_bar(position = "dodge", stat = "count", color = "white") +
facet_grid(experiment_id ~ ., scales = "free_y") +
#scale_x_discrete(expand = c(0, 0)) +
#scale_y_continuous(expand = c(0, 0)) +
guides(
color = guide_legend(ncol = 2),
fill = guide_legend(ncol = 2)
) +
theme(
panel.spacing.y = unit(0.9, "lines"),
panel.grid.major = element_blank()
) +
labs(
x = "Time since last division to experiment start",
y = "Count of cells",
fill = "Cell status"
) +
NULLexperiment_datasets <- tracks_summary_df %>%
select(experiment_id, cell_status, contains("length"), contains("gfp"), -contains("sos"), -contains("end")) %>%
group_by(experiment_id) %>%
{
grouped_data <- .
group_split(grouped_data) %>%
set_names(nm = group_keys(grouped_data) %>% pull())
} %>%
map(select, -experiment_id) %>%
identity()
chromosome_df <- experiment_datasets$Chromosome
plasmid_df <- experiment_datasets$Plasmidc_pca_rec <- recipe(cell_status ~ ., data = chromosome_df) %>%
step_naomit(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
c_pca_prep <- prep(c_pca_rec)
c_pca_prep## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 2
##
## Training data contained 1061 data points and no missing data.
##
## Operations:
##
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## PCA extraction with initial_length, initial_gfp [trained]
c_tidied_pca <- tidy(c_pca_prep, 3)
c_tidied_pca %>%
filter(component %in% paste0("PC", 1:5)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)c_pca_prep %>%
juice() %>%
ggplot(aes(x = PC1, y = PC2, color = cell_status)) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_jitter(size = 0.7) +
guides(
color = guide_legend(ncol = 2, override.aes = list(alpha = 1)),
fill = guide_legend(ncol = 2)
) +
labs(
color = "Cell status"
) +
NULLp_pca_rec <- recipe(cell_status ~ ., data = plasmid_df) %>%
step_naomit(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
p_pca_prep <- prep(p_pca_rec)
p_pca_prep## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 2
##
## Training data contained 12463 data points and no missing data.
##
## Operations:
##
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## PCA extraction with initial_length, initial_gfp [trained]
p_tidied_pca <- tidy(p_pca_prep, 3)
p_tidied_pca %>%
filter(component %in% paste0("PC", 1:5)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)p_pca_prep %>%
juice() %>%
ggplot(aes(x = PC1, y = PC2, color = cell_status)) +
geom_vline(xintercept = 0) +
geom_hline(yintercept = 0) +
geom_jitter(size = 0.1, alpha = 1/3) +
scale_x_continuous(limits = c(NA, 5)) +
scale_y_continuous(limits = c(NA, 7)) +
guides(
color = guide_legend(ncol = 2, override.aes = list(alpha = 1, size = 1)),
fill = guide_legend(ncol = 2)
) +
labs(
color = "Cell status"
) +
NULLp_umap_rec <- recipe(cell_status ~ ., data = plasmid_df) %>%
step_naomit(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
p_umap_prep <- prep(p_umap_rec)
p_umap_prep## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 2
##
## Training data contained 12463 data points and no missing data.
##
## Operations:
##
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## UMAP embedding for initial_length, initial_gfp [trained]
juice(p_umap_prep) %>%
ggplot(aes(umap_1, umap_2, label )) +
geom_point(aes(color = cell_status), alpha = 0.7, size = 2) +
labs(
x = "UMAP 1",
y = "UMAP 2",
color = "Cell status"
) +
guides(
color = guide_legend(ncol = 2),
fill = guide_legend(ncol = 2)
) +
NULLc_umap_rec <- recipe(cell_status ~ ., data = chromosome_df) %>%
step_naomit(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
c_umap_prep <- prep(c_umap_rec)
c_umap_prep## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 2
##
## Training data contained 1061 data points and no missing data.
##
## Operations:
##
## Removing rows with NA values in all_predictors()
## Centering and scaling for initial_length, initial_gfp [trained]
## UMAP embedding for initial_length, initial_gfp [trained]
juice(c_umap_prep) %>%
ggplot(aes(umap_1, umap_2, label )) +
geom_point(aes(color = cell_status), alpha = 0.7, size = 2) +
guides(
color = guide_legend(ncol = 2),
fill = guide_legend(ncol = 2)
) +
labs(
x = "UMAP 1",
y = "UMAP 2",
color = "Cell status"
)chromosome_df %>%
select(cell_status, where(is.numeric)) %>%
rename(`Initial GFP` = initial_gfp, `Initial length` = initial_length) %>%
ggpairs(
data = .,
columns = 2:ncol(.),
aes(color = cell_status, alpha = 1/1000)
)plasmid_df %>%
select(cell_status, where(is.numeric)) %>%
rename(`Initial GFP` = initial_gfp, `Initial length` = initial_length) %>%
ggpairs(
data = .,
columns = 2:ncol(.),
aes(color = cell_status, alpha = 1/1000),
#upper = list(continuous = "points")
)